Our project was motivated by the need to understand the demand for and the supply of social services in the United States of America. To narrow this down further, we restricted this study to the state of California, as there is relatively greater access to social services such as WIC and SNAP in this state. In FY 2021, SNAP helped 4.3 million CA residents or 11% of the population, and in FY 2021, WIC helped 950,000 CA residents.
Studies also indicate that many households in California struggle to put food on the table. These findings indicate the need for further analysis of WIC and SNAP services in California.
Most recent data indicates that:
Policy Questions:
We used data from California Open Data portals, USDA.gov, and census data as well to conduct our analyses.
We used a combination of tools to answer our motivating policy questions. First, we began by conducting exploratory analyses, including the use of graphs and geospatial visualizations, to further analyze access to WIC and SNAP store locations in different counties in California. We were able to discern trends between different populations and identify the impact of development (urban or rural county) on access to social services. We then used supervised and unsupervised machine learning models (text analysis, cluster analysis, linear regression model) to predict store locations that were likely to have WIC or SNAP and number of vouchers redeemed across counties, based on date and number of families in the region.
The challenges we came across while coding this project were largely related to the data we had access to. We were unable to conduct analysis by race and gender, because of the way these factors were coded into the table. With more time and access to resources, we would love to analyze how women and non-binary folks access these social services. Comparing and contrasting these results in urban counties and rural counties will also be critical to policy making.
#loading and cleaning data in advance so that it can be used in a combination of plot below
library(tidycensus)
library(tidyverse)
library(ggplot2)
library(dplyr)
library(janitor)
library(sf)
library(stringr)
library(lubridate)
library(ggridges)
library(viridis)
library(hrbrthemes)
#1. SNAP Store Locations - "retailers"
# Read the raw SNAP retailer file, standardize the column names, keep only
# California stores, and upper-case the county to match the census join key.
retailers <- read_csv("SNAP_Store_Locations.csv") %>%
  janitor::clean_names() %>%
  filter(state == "CA") %>%
  mutate(COUNTY = toupper(county)) %>%
  select(-county, -address_line_2)
# creating urban and rural counties
# Two hand-maintained county lists; each retailer gets a 0/1 flag for each.
# A county missing from both lists ends up with urban = 0 and rural = 0.
urban <- c(
  "LOS ANGELES", "ORANGE", "SAN FRANCISCO", "SAN MATEO", "ALAMEDA",
  "CONTRA COSTA", "MARIN", "RIVERSIDE", "SACRAMENTO", "SAN BERNARDINO",
  "SAN DIEGO", "SAN JOAQUIN", "SANTA CLARA", "VENTURA", "FRESNO"
)
rural <- c(
  "ALPINE", "AMADOR", "BUTTE", "CALAVERAS", "COLUSA", "DEL NORTE",
  "EL DORADO", "GLENN", "HUMBOLDT", "IMPERIAL", "INYO", "LAKE", "LASSEN",
  "MADERA", "MARIPOSA", "MENDOCINO", "MERCED", "MODOC", "MONO", "MONTEREY",
  "NAPA", "NEVADA", "PLACER", "PLUMAS", "SAN BENITO", "SAN LUIS OBISPO",
  "SANTA BARBARA", "SHASTA", "SIERRA", "SISKIYOU", "SOLANO", "SONOMA",
  "SUTTER", "TEHAMA", "TRINITY", "TULARE", "TUOLUMNE", "YOLO", "YUBA",
  "STANISLAUS", "KERN", "KINGS", "SANTA CRUZ"
)
retailers <- retailers %>%
  mutate(
    urban = if_else(COUNTY %in% urban, 1, 0),
    rural = if_else(COUNTY %in% rural, 1, 0)
  )
#prepping for visualizations & joining to census data
#creating ca_retails: one row per county with its SNAP retailer count
ca_retails <- retailers %>%
  count(COUNTY, name = "N_RETAIL")
#2. Census data + SNAP retailer data
# Download the Census Bureau 2010-2019 county population estimates and keep
# the 2019 estimate for each California county (SUMLEV "050" = county rows).
county_pop_url <- "https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv"
county_pop_raw <- read_csv(url(county_pop_url))
ca_pop <- county_pop_raw %>%
  filter(SUMLEV == "050", STNAME == "California") %>%
  select(COUNTY = CTYNAME, POPESTIMATE2019) %>%
  mutate(
    COUNTY = toupper(COUNTY),
    # strip the trailing " COUNTY" so names match the retailer table
    COUNTY = gsub("(.*)( COUNTY)", "\\1", COUNTY)
  )
#combined data with retailers and pop data
# Join retailer counts to population and express access as retailers per
# 100,000 residents so counties of different sizes are comparable.
combined_ca_data <- left_join(ca_retails, ca_pop, by = "COUNTY") %>%
  mutate(retailers_per_100k = N_RETAIL / (POPESTIMATE2019 / 100000))
#save retailers_per_100k as an object
# FIX: combined_ca_data already carries the retailers_per_100k column, so
# recomputing the same mutate was redundant — a plain copy is equivalent.
retailers_per_100k <- combined_ca_data
#3. SNAP participant demographic counties
# Read county-level SNAP participant demographics, standardize names, and
# drop the statewide total row so every row is one county-year.
snap_demos <- read_csv("particp_demog_counties.csv")
tidy_demos <- snap_demos %>%
  clean_names() %>%
  rename(year = fileyear) %>%
  mutate(COUNTY = toupper(county)) %>%
  select(year, COUNTY, person, cases, female, male, black, hispanic,
         asian_pi, native_american_other_unknown, white) %>%
  filter(COUNTY != "COUNTY TOTAL") %>%
  ## issue with this dataset is that it does not include # of cases by those
  ## demographics; it only shows county demographics and total case counts.
  # participation rate = cases per person in the county
  mutate(rate = cases/person)
#4. Cal Fresh Data
# Read the CalFresh county file; COUNTY is upper-cased to match the join
# key used by every other table in this script.
calfresh <- read_csv("calfresh.csv")
tidy_calfresh <- calfresh %>%
janitor::clean_names() %>%
rename(COUNTY = county) %>%
mutate(COUNTY = toupper(COUNTY))
#5. WIC vendors
# Load the WIC vendor list, normalize the county key, then collapse to one
# row per county with its vendor count.
wic_vendors <- read_csv("vendor.csv")
wic_tidy <- wic_vendors %>%
  mutate(COUNTY = toupper(COUNTY))
# WIC retailers data
wic_retailers <- wic_tidy %>%
  count(COUNTY, name = "N_RETAIL")
#6 WIC redemption
#by county
wic_redemp_county <- read_csv("wic_county.csv") %>%
  clean_names()
#by participant: split year_month, drop the statewide annual rows and the
# unused cost columns, and strip "$" from the average-cost measures.
wic_redemp_particp <- read_csv("wic_participant.csv") %>%
  clean_names() %>%
  separate(year_month, c("year", "month")) %>%
  rename(COUNTY = vendor_location) %>%
  filter(COUNTY != "STATEWIDE ANNUAL") %>%
  select(-statewide_infant_formula_rebate, -total_cost_vouchers,
         -total_cost_vouchers_adjusted, -average_cost_adjusted,
         -state_average_cost_adjusted) %>%
  mutate(across(starts_with("average"), ~as.numeric(gsub("\\$", "", .)))) %>%
  mutate(month = as.numeric(month),
         year = as.numeric(year))
# grouped copy used by the PCA section below; the count columns contain
# thousands separators, so strip "," before converting to numeric
wic_part <- wic_redemp_particp %>%
  group_by(year, COUNTY, participant_category) %>%
  mutate(
    number_of_participants_redeemed =
      as.numeric(gsub(",", "", number_of_participants_redeemed)),
    number_vouchers_redeemed =
      as.numeric(gsub(",", "", number_vouchers_redeemed))
  )
# urban indicator for the participant-level table (uses the `urban` vector
# of county names defined earlier)
wic_binary <- wic_redemp_particp %>%
  mutate(urban = if_else(COUNTY %in% urban, 1, 0))
# Average cost per WIC participant type, faceted by year. Bar labels are
# rotated 90 degrees and nudged just above each bar; axis text is hidden
# because the fill legend already identifies the categories.
wic_redemp_particp %>%
  ggplot() +
  geom_col(aes(participant_category, average_cost,
               fill = participant_category)) +
  geom_text(
    aes(participant_category, average_cost, label = average_cost,
        angle = 90, hjust = -0.3),
    check_overlap = TRUE
  ) +
  facet_wrap(~year) +
  labs(
    title = "Average Cost by WIC Participant",
    x = "Type of Participant",
    y = "Average Cost per Participant (in $)"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
## Warning: Removed 1327 rows containing missing values (position_stack).
## Warning: Removed 1327 rows containing missing values (geom_text).
# Participation-rate time series: the raw file has one column per year;
# reshape into long (year, rate) pairs for plotting.
wic_time_series <- read_csv("participant_rates.csv") %>%
  pivot_longer(cols = as.character(2010:2018),
               names_to = "year",
               values_to = "rate")
## Rows: 5 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): participant_category
## dbl (9): 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Line chart of WIC participation rates over time, one line per type.
wic_time_series %>%
  mutate(year = as.numeric(year)) %>%
  ggplot(aes(x = year, y = rate, color = participant_category)) +
  geom_line(alpha = 0.6, size = 1) +
  labs(title = "WIC Participation Rates by Type",
       x = "Year",
       y = "Rate")
#THIS IS BEAUTIFUL PLEASE LEAVE IT ALONE - See if we can adjust?
# Violin plot of average cost by participant category.
# FIX: theme_minimal() is a *complete* theme, so applying it after
# theme(legend.position = "none") silently restored the legend. Apply the
# complete theme first, then the override. The duplicate xlab("class") was
# also dead code (immediately overwritten by xlab("")).
wic_redemp_particp %>%
  ggplot(aes(x = participant_category, y = average_cost,
             fill = participant_category)) +
  geom_violin() +
  xlab("") +
  theme_minimal() +
  theme(legend.position = "none")
## Warning: Removed 1327 rows containing non-finite values (stat_ydensity).
#flat and pointy - why?
# Urban (1) vs rural (0) comparison of average cost by participant category.
# FIXES: (1) theme_minimal() is a complete theme and was wiping out the
# earlier theme(legend.position = "none") — apply it first; (2) labs() has
# no `source` argument in ggplot2, so the data credit was dropped — use
# `caption` instead.
wic_binary %>%
  ggplot(aes(x = participant_category, y = average_cost,
             fill = participant_category)) +
  geom_violin() +
  facet_wrap(~urban) +
  xlab("") +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  labs(
    title = "Average Cost by Participant, 0=Rural and 1=Urban",
    caption = "California Open Data"
  )
## Warning: Removed 1327 rows containing non-finite values (stat_ydensity).
library(tigris)
#creating ca_counties
# Download the 2020 cartographic-boundary county shapefile for California
# as an sf object (cb = TRUE gives the simplified 1:500,000 boundaries).
ca_counties_raw <- tigris::counties(
  state = "CA",
  cb = TRUE,
  resolution = "500k",
  year = 2020,
  class = "sf"
)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 32%
|
|======================= | 33%
|
|======================= | 34%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|============================== | 44%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 49%
|
|=================================== | 50%
|
|=================================== | 51%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================= | 71%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 89%
|
|=============================================================== | 90%
|
|=============================================================== | 91%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 99%
|
|======================================================================| 100%
#getting geo outline of CA by county
ca_counties_raw %>%
  ggplot() +
  geom_sf() +
  theme_void()
# keep only the join key (GEOID), a display name, and an upper-cased
# COUNTY column that matches every other table in this script
ca_counties <- ca_counties_raw %>%
  dplyr::transmute(GEOID, MAP_NAME = NAME, COUNTY = toupper(NAME))
#Geo spatial graph - Retailers per 100k People
# FIX: tigris::geo_join is deprecated (it warns to use the dplyr join
# family). ca_counties is an sf data frame, so a plain left_join attaches
# the retailer rates by COUNTY while keeping the geometry column.
ca_geospatial_data <- left_join(ca_counties, combined_ca_data, by = "COUNTY")
ggplot(ca_geospatial_data, aes(fill = retailers_per_100k)) +
  geom_sf() +
  #scale_fill_viridis_c() +
  labs(
    title = "Number of Retailers per 100,000 People by County in 2019"
  ) +
  theme_void()
#Cal fresh geospatial analysis for child, adult, and elderly participation rates
# Keep the columns needed for the rates, drop incomplete rows, and compute
# participation rates as enrolled / eligible population per age group.
join_calfresh <- tidy_calfresh %>%
  select(COUNTY, year, elderly, adults, children, esl, total_population_cy,
         total_elderly_60plus_cy, total_children_under_18_cy,
         total_esl_over_age_5_cy) %>%
  na.omit() %>%
  mutate(
    rate_elder = elderly / total_elderly_60plus_cy,
    rate_child = children / total_children_under_18_cy,
    # NOTE(review): the adult rate divides by *total* population, not an
    # adult-only denominator — confirm this is intended
    rate_adult = adults / total_population_cy
  )
# FIX: tigris::geo_join is deprecated; inner_join on the sf table keeps
# geometry and only the counties present in both datasets (how = "inner").
calfresh_map <- inner_join(ca_counties, join_calfresh, by = "COUNTY")
## Warning: We recommend using the dplyr::*_join() family of functions instead.
# One choropleth per age group; a small helper avoids repeating the same
# geom/scale/theme stack three times. {{ }} defers evaluation of the fill
# column to ggplot, so behavior matches the original aes() calls exactly.
viridis_map <- function(fill_var) {
  ggplot(calfresh_map, aes(fill = {{ fill_var }})) +
    geom_sf() +
    scale_fill_viridis_c() +
    theme_void()
}
map_child <- viridis_map(rate_child)
map_adult <- viridis_map(rate_adult)
map_elder <- viridis_map(rate_elder)
library(patchwork)
# patchwork composes the three maps side by side
map_child + map_adult + map_elder
#cal fresh map by year
# One faceted-by-year choropleth per age group; helper removes the
# triplicated plotting code while producing identical plots.
facet_rate_map <- function(fill_var, plot_title) {
  calfresh_map %>%
    ggplot(aes(fill = {{ fill_var }})) +
    geom_sf() +
    scale_fill_viridis_c() +
    facet_wrap(~year) +
    labs(title = plot_title) +
    theme(axis.text.x = element_blank(),
          axis.ticks.x = element_blank())
}
facet_rate_map(rate_child, "Child SNAP Participation Rate")
facet_rate_map(rate_elder, "Elderly SNAP Participation Rate")
facet_rate_map(rate_adult, "Adult SNAP Participation Rate")
This overall rate graph shows that a large percentage of eligible people across California do not participate in SNAP despite qualifying for services. The ideal rate, of course, is 100% — serving everyone who is eligible. This raises the question: what can California do better to increase the participation rate? Who is not being reached?
Future research could include data that provides participation rate of women and of women who are parents to one or more children.
# SNAP Map
# FIX: tigris::geo_join is deprecated (the captured warning recommends the
# dplyr join family). inner_join on the sf county table keeps geometry and
# only the counties present in both datasets (how = "inner").
snap_map <- inner_join(ca_counties, tidy_demos, by = "COUNTY")
## Warning: We recommend using the dplyr::*_join() family of functions instead.
# SNAP participation rate by county, faceted by year
snap_map %>%
  ggplot(aes(fill = rate)) +
  geom_sf() +
  scale_fill_viridis_c() +
  facet_wrap(~year) +
  labs(title = "SNAP Participation Rate") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())
#WIC retailers per 100K
# WIC vendors per 100,000 residents, joined onto county geometry.
wic_join <- left_join(wic_retailers, ca_pop, by = "COUNTY") %>%
  mutate(retailers_per_100k = N_RETAIL / (POPESTIMATE2019 / 100000))
library(tigris)
# FIX: tigris::geo_join is deprecated; a dplyr left_join on the sf table
# attaches the rates by COUNTY and keeps the geometry (how = "left").
wic_geo_data <- left_join(ca_counties, wic_join, by = "COUNTY")
## Warning: We recommend using the dplyr::*_join() family of functions instead.
# choropleth of WIC retailers per 100k residents
ggplot(wic_geo_data, aes(fill = retailers_per_100k)) +
  geom_sf() +
  #scale_fill_viridis_c() +
  labs(title = "WIC Retailers per 100k") +
  theme_void()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.2.0 ──
## ✔ broom 0.7.12 ✔ rsample 0.1.1
## ✔ dials 0.1.0 ✔ tune 0.2.0
## ✔ infer 1.0.0 ✔ workflows 0.2.6
## ✔ modeldata 0.1.1 ✔ workflowsets 0.2.1
## ✔ parsnip 0.2.1 ✔ yardstick 0.0.9
## ✔ recipes 0.2.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(rsample)
library(parsnip)
library(recipes)
library(workflows)
library(tune)
library(yardstick)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-3
#Data loading and cleaning
# Modeling table: numeric year/month plus the redemption measures. The two
# statewide summary rows, the all-NA adjusted-cost columns, and the
# character identifier columns are dropped so only usable predictors remain.
wic_models <- read_csv("wic_participant.csv") %>%
  janitor::clean_names() %>%
  filter(!vendor_location %in% c("STATEWIDE ANNUAL", "STATEWIDE")) %>%
  select(-statewide_infant_formula_rebate, -total_cost_vouchers_adjusted,
         -average_cost_adjusted, -state_average_cost_adjusted,
         -total_cost_vouchers, -participant_category, -vendor_location) %>%
  separate(year_month, c("year", "month")) %>%
  mutate(year = as.numeric(year),
         month = as.numeric(month))
## Rows: 33851 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Vendor Location, Participant Category, Year Month, Number of Partic...
## lgl (4): Statewide Infant Formula Rebate, Total Cost Vouchers Adjusted, Aver...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Strip thousands separators and dollar signs, then convert the measure
# columns to numeric in a single mutate (equivalent to the per-column
# base-R assignments).
wic_models <- wic_models %>%
  mutate(
    number_of_participants_redeemed =
      as.numeric(gsub(",", "", number_of_participants_redeemed)),
    number_vouchers_redeemed =
      as.numeric(gsub(",", "", number_vouchers_redeemed)),
    average_cost = as.numeric(gsub("\\$", "", average_cost))
  )
#predict number of vouchers redeemed
#----------------------------------------
#linear regression
#initial split
# 70/30 train/test split, stratified on the outcome so both partitions
# cover its distribution; seed fixed for reproducibility
set.seed(20211102)
split <- initial_split(wic_models, prop = 0.7, strata = "number_vouchers_redeemed")
wic_train <- training(split)
wic_test <- testing(split)
#recipe
# drop near-zero-variance predictors, then center/scale the remainder;
# statistics are estimated from the training data only
wic_rec <- recipe(number_vouchers_redeemed ~., data = wic_train) %>%
step_nzv(all_predictors()) %>%
step_normalize(all_predictors())
# set up resampling using 10-fold cross validation
set.seed(20211102)
folds <- vfold_cv(data = wic_train, v = 10, repeats = 1)
# create a linear regression model
# plain OLS through the "lm" engine; note linear_reg() here has no tuning
# parameters
wic_mod <- linear_reg() %>%
set_engine("lm")
# create a workflow
# bundles the preprocessing recipe and the model so they are applied together
wic_wf <- workflow() %>%
add_recipe(wic_rec) %>%
add_model(wic_mod)
# fit the model
# one fit per CV fold to estimate out-of-sample rmse/rsq
wic_cv <- wic_wf %>%
fit_resamples(resamples = folds)
# select the best model
# NOTE(review): with no tuning grid there is only one candidate, so
# select_best()/finalize_workflow() are effectively no-ops here — harmless,
# but kept for symmetry with the (commented-out) lasso pipeline below.
wic_best <- wic_cv %>%
select_best("rmse")
# finalize the workflow
wic_final <- finalize_workflow(
wic_wf,
parameters = wic_best
)
# fit to the training data and extract coefficients
# NOTE(review): wic_best$penalty does not exist for an untuned lm, so
# lambda is NULL here — confirm vip::vi() ignores it as intended.
wic_coefs <- wic_final %>%
fit(data = wic_train) %>%
extract_fit_parsnip() %>%
vip::vi(lambda = wic_best$penalty)
# cross-validated mean rmse/rsq for the single lm configuration
collect_metrics(wic_cv, summarize = TRUE) %>%
  filter(.config == "Preprocessor1_Model1")
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 5143. 10 212. Preprocessor1_Model1
## 2 rsq standard 0.992 10 0.000722 Preprocessor1_Model1
#plot the RMSE across the 10 resamples
collect_metrics(wic_cv, summarize = FALSE) %>%
filter(.config == "Preprocessor1_Model1",
.metric == "rmse") %>%
ggplot(aes(id, .estimate, group = .metric)) +
geom_line() +
geom_point() +
scale_y_continuous() +
labs(title = "Calculated RMSE Across the 10 Folds",
y = "RMSE_hat") +
theme_minimal()
# fit the finalized workflow
# BUG FIX: the model was previously fit on wic_test — training on the
# held-out test set invalidates any out-of-sample evaluation. Fit the
# finalized workflow (wic_final, which carries the recipe + model) on the
# training data instead.
wic_final_model <- fit(wic_final, data = wic_train)
#make predictions
# predictions over the full dataset, bound column-wise to the original rows
# (as in the original script) so actuals and .pred sit side by side
final_predictions <- bind_cols(
  wic_models,
  predict(wic_final_model, wic_models, type = "numeric"))
print(final_predictions)
## # A tibble: 33,302 × 6
## year month number_of_participants_rede… number_vouchers… average_cost .pred
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2010 1 3698 15460 59.1 13086.
## 2 2010 2 3674 15292 58.3 13036.
## 3 2010 3 3728 15506 58.1 13234.
## 4 2010 4 3625 15232 59.1 12795.
## 5 2010 5 3662 15282 58.7 12944.
## 6 2010 6 3703 15519 58.6 13089.
## 7 2010 7 3713 15474 58.5 13124.
## 8 2010 8 3671 15426 60.0 12873.
## 9 2010 9 3708 15430 59.8 13012.
## 10 2010 10 3703 15308 60.4 12943.
## # … with 33,292 more rows
#----------
#Lasso regression
#initial split
#set.seed(20211101)
#lasso_split <- initial_split(data = wic_models, prop = 0.75)
# create the training and testing data
#lasso_train <- training(x = lasso_split)
#lasso_test <- testing(x = lasso_split)
#folds
#l_folds <- vfold_cv(data = lasso_train, v = 10)
#Create wic recipe
#wic_rec <- recipe(average_cost ~., data = wic_train) %>%
# step_nzv(all_predictors()) %>%
# step_normalize(all_predictors()) %>%
# step_indicate_na()
#tuning grid for lasso
#lasso_grid <- grid_regular(penalty(), levels = 10)
#lasso mod
#lasso_mod <-
# linear_reg(
# penalty = tune(),
# mixture = 1
# ) %>%
#  set_engine(engine = "glmnet")
#lasso workflow
#lasso_wf <-
# workflow() %>%
# add_model(spec = lasso_mod) %>%
# add_recipe(recipe = wic_rec)
#hyperparameter tuning
#lasso_cv <- lasso_wf %>%
# tune_grid(
# resamples = folds,
# grid = lasso_grid)
# select the best model based on the "rmse" metric
#lasso_best <- lasso_cv %>%
# select_best(metric = "rmse")
# use the finalize_workflow() function with your lasso workflow and the best model
# to update (or "finalize") your workflow by modifying the line below
#lasso_final <- finalize_workflow(
# lasso_wf,
# parameters = lasso_best)
# fit to the training data and extract coefficients
#lasso_coefs <- lasso_final %>%
# fit(data = lasso_train)
# extract_fit_parsnip() %>%
# vip::vi(lambda = lasso_best$penalty)
#collect_metrics(lasso_cv, summarize=TRUE) %>%
# filter(.config == "Preprocessor1_Model10")
#plot the RMSE across the 10 resamples
#collect_metrics(lasso_cv, summarize = FALSE) %>%
#filter(.config == "Preprocessor1_Model10",
# .metric == "rmse") %>%
#ggplot(aes(id, .estimate, group = .metric)) +
#geom_line() +
#geom_point() +
#scale_y_continuous() +
#labs(title = "Calculated RMSE Across the 10 Folds",
#y = "RMSE_hat") +
#theme_minimal()
# mutate(across(starts_with("average"), ~gsub("\\$", "", .) %>% as.numeric)) %>%
# mutate(across(starts_with("total"), ~gsub("\\$", "", .) %>% as.numeric))
# select(participant_category, number_of_participants_redeemed, number_vouchers_redeemed)
#wic_models$total_cost_vouchers = as.numeric(gsub("\\$", "", wic_models$total_cost_vouchers))
Interpretation: Squares that are nearly white tell us that there is no correlation between cost and year, indicating that the cost is consistent across years. The number of participants and the number of vouchers are almost perfectly correlated, indicating that the number of vouchers received per participant does not differ much across people.
library(corrplot)
## corrplot 0.92 loaded
#create numeric data for PCA analysis
# NOTE: wic_part is grouped by year, COUNTY, participant_category, so
# select() silently retains those grouping columns (hence the "Adding
# missing grouping variables" message below). That is why
# participant_category is still available to case_when() here and why
# `year` survives into the numeric table used for the correlation matrix.
# part_type encodes the five participant categories as numeric codes 1-5.
wic_numeric <- wic_part %>%
filter(COUNTY == "STATEWIDE") %>%
select(number_of_participants_redeemed, number_vouchers_redeemed, average_cost) %>%
mutate(part_type = case_when(
participant_category == "Breastfeeding Mother" ~ "1",
participant_category == "Child" ~ "2",
participant_category == "Infant" ~ "3",
participant_category == "Non-Breastfeeding Mother" ~ "4",
participant_category == "Prenatal" ~ "5"
)) %>%
ungroup() %>%
select(-participant_category, -COUNTY) %>%
mutate(part_type = as.numeric(part_type)) %>%
rename(n_vouchers = number_vouchers_redeemed) %>%
rename(n_part = number_of_participants_redeemed)
## Adding missing grouping variables: `year`, `COUNTY`, `participant_category`
correlation_plot <- cor(wic_numeric)
# creates a correlation plot (shaded squares = pairwise correlations)
corrplot(correlation_plot, method = "shade")
# Same construction as wic_numeric above, but this version KEEPS the
# COUNTY and participant_category identifier columns so cluster labels
# can be attached back to them later. The grouped select() again silently
# retains the grouping variables (see message below).
wic_state <- wic_part %>%
filter(COUNTY == "STATEWIDE") %>%
select(number_of_participants_redeemed, number_vouchers_redeemed, average_cost) %>%
mutate(part_type = case_when(
participant_category == "Breastfeeding Mother" ~ "1",
participant_category == "Child" ~ "2",
participant_category == "Infant" ~ "3",
participant_category == "Non-Breastfeeding Mother" ~ "4",
participant_category == "Prenatal" ~ "5"
)) %>%
ungroup() %>%
mutate(part_type = as.numeric(part_type)) %>%
rename(n_vouchers = number_vouchers_redeemed) %>%
rename(n_part = number_of_participants_redeemed)
## Adding missing grouping variables: `year`, `COUNTY`, `participant_category`
# numeric columns only (drops COUNTY / participant_category) for PCA
wic_pca_numeric <- wic_state %>%
select_if(is.numeric)
#run PCA on the numeric table (prcomp centers by default, no scaling here)
pca_wic <- prcomp(wic_pca_numeric)
#extract principal component scores as a tibble
wic_pcs <- as_tibble(pca_wic$x)
#attach county and participant category back to the component scores
wic_pcs <- bind_cols(
  select(wic_state, COUNTY, participant_category),
  wic_pcs
)
library(cluster)
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:dials':
##
## prune
## The following object is masked from 'package:stats':
##
## cutree
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(recipes)
# Recipe with predictors only (no outcome): center and scale every column.
# prep() estimates the means/sds from wic_numeric itself.
wic_pca_rec <- recipe(~., data = wic_numeric) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  prep()
# bake with new_data = NULL returns the processed training data
wic_clust <- wic_pca_rec %>%
  bake(new_data = NULL)
# PCA ---------------------------------------------------------------------
# pairwise correlations of the standardized variables
cor(wic_clust)
## year n_part n_vouchers average_cost part_type
## year 1.000000e+00 -0.13902219 -0.1380343 0.009404715 -3.218411e-21
## n_part -1.390222e-01 1.00000000 0.9934968 -0.065260541 -3.320404e-01
## n_vouchers -1.380343e-01 0.99349678 1.0000000 -0.160118546 -3.460947e-01
## average_cost 9.404715e-03 -0.06526054 -0.1601185 1.000000000 -8.351521e-02
## part_type -3.218411e-21 -0.33204042 -0.3460947 -0.083515208 1.000000e+00
# conduct PCA on the standardized wic_clust data
principal_components <- prcomp(wic_clust)
# obtain summary metrics (std dev / proportion of variance per component)
summary(principal_components)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.4919 1.0353 0.9914 0.8472 0.03944
## Proportion of Variance 0.4452 0.2144 0.1966 0.1436 0.00031
## Cumulative Proportion 0.4452 0.6596 0.8561 0.9997 1.00000
# obtain loadings: each column of the rotation matrix gives one component's
# weights on the original (standardized) variables
principal_components$rotation
## PC1 PC2 PC3 PC4 PC5
## year 0.1475834 -0.28036319 0.91421762 -0.2526286 0.0007505649
## n_part -0.6465852 0.01275225 0.02385580 -0.3076241 -0.6975408768
## n_vouchers -0.6528909 0.07800267 0.06358958 -0.2357423 0.7127629379
## average_cost 0.0967908 -0.81097916 -0.38223385 -0.4264704 0.0704600924
## part_type 0.3528623 0.50740738 -0.11614946 -0.7772353 0.0209888458
# obtain component values for each observation; keep the first three PCs
pca_data <- principal_components$x %>%
  as_tibble() %>%
  select(PC1, PC2, PC3)
# set a seed because the clusters are not deterministic
set.seed(20200205)
# diagnostics for choosing k (order preserved so the RNG state matches):
# elbow via total within sum of squares
fviz_nbclust(wic_clust, FUN = kmeans, method = "wss")
# total silhouette width
fviz_nbclust(wic_clust, FUN = kmeans, method = "silhouette")
# gap statistic
fviz_nbclust(wic_clust, FUN = kmeans, method = "gap_stat")
# k-means on the standardized data with k = 6 (chosen from the diagnostics
# above); nstart = 100 restarts guards against poor local optima
clust_kmeans <- kmeans(wic_clust, centers = 6, nstart = 100)
# examine the cluster means
tidy(clust_kmeans)
## # A tibble: 6 × 8
## year n_part n_vouchers average_cost part_type size withinss cluster
## <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <fct>
## 1 2.06e-17 -0.637 -0.530 -0.196 -1.41 108 109. 1
## 2 9.67e- 1 1.45 1.46 -0.569 -0.706 48 14.6 2
## 3 -7.74e- 1 2.26 2.31 -0.582 -0.706 60 21.0 3
## 4 -7.74e- 1 -0.554 -0.520 -0.584 1.06 120 61.3 4
## 5 2.06e-17 -0.0535 -0.263 1.95 0 108 111. 5
## 6 9.67e- 1 -0.668 -0.629 -0.599 1.06 96 36.1 6
# fit with 6 clusters, this time on the PCA scores (wic_pca_numeric —
# presumably the numeric PC columns; verify upstream) rather than the
# raw standardized features
# NOTE(review): there is no set.seed() immediately before this kmeans call,
# so the assignment depends on the RNG state left by the plots above —
# consider re-seeding here for reproducibility
wic_kmeans6 <- kmeans(
wic_pca_numeric,
centers = 6,
nstart = 100
)
# cross-tabulate participant category against cluster assignment
# NOTE(review): bind_cols() matches purely by row position — this assumes
# wic_state rows are in the same order as the matrix passed to kmeans;
# confirm against the upstream data preparation
bind_cols(
select(wic_state, COUNTY, participant_category),
cluster = wic_kmeans6$cluster
) %>%
count(participant_category, cluster)
## # A tibble: 11 × 3
## participant_category cluster n
## <chr> <int> <int>
## 1 Breastfeeding Mother 4 87
## 2 Breastfeeding Mother 6 21
## 3 Child 1 56
## 4 Child 2 25
## 5 Child 3 27
## 6 Infant 4 3
## 7 Infant 5 105
## 8 Non-Breastfeeding Mother 6 108
## 9 Prenatal 4 63
## 10 Prenatal 5 40
## 11 Prenatal 6 5
# attach the five PC scores and the cluster label to each observation for
# plotting (same positional bind_cols caveat as above)
wic_clusters <- bind_cols(
select(wic_state, COUNTY, participant_category),
select(wic_pcs, PC1, PC2, PC3, PC4, PC5),
cluster6 = wic_kmeans6$cluster)
# Scatter the observations in the space of the first two principal
# components, coloured by their k = 6 cluster assignment.
wic_clusters %>%
  ggplot(aes(x = PC1, y = PC2, color = factor(cluster6))) +
  geom_point(alpha = 0.5) +
  theme_minimal()
library(tidytext)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dials':
##
## degree, neighbors
## The following object is masked from 'package:tigris':
##
## blocks
## The following objects are masked from 'package:lubridate':
##
## %--%, union
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
## Registered S3 method overwritten by 'ggforce':
## method from
## scale_type.units units
# use the retailers data that was cleaned earlier: drop rows with a missing
# store name and the unused zip4 column
retailers_text <- retailers %>%
  filter(!is.na(store_name)) %>%
  select(-zip4)
# tokenize store names into bigrams (pairs of consecutive words)
# FIX: tokenize the cleaned retailers_text table — the original code
# tokenized the uncleaned `retailers`, so the cleaning step was dead code
# and NA-name rows leaked into the downstream counts
bigram_snap <- retailers_text %>%
  unnest_tokens(bigram, store_name, token = "ngrams", n = 2)
# split each bigram into its two component words
bigrams_separated <- bigram_snap %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# drop bigrams in which either word is a stopword
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word)
# count appearances of each bigram and keep those seen more than 30 times;
# single-word store names tokenize to NA bigrams, so drop those too
bigram_30 <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE) %>%
  filter(n > 30) %>%
  filter(!is.na(word1))
# build a directed graph from the frequent bigrams
bigram_graph <- bigram_30 %>%
  graph_from_data_frame()
# plot the word-to-word relationships (seed fixes the "fr" force-directed
# layout, which is otherwise random)
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
# re-join the two words into a single bigram column for the tf-idf analysis
combined1 <- bigrams_filtered %>%
  mutate(bigram = paste(word1, word2))
#-------------------------
# repeat geom_col for the 6 largest urban counties, 6 largest rural/suburban,
# and 6 smallest rural counties
# stopword vectors for city-name fragments that don't tell us anything about
# the name/type of store
urb_city_names <- c(
  "santa", "clara", "ana", "san", "jacinto", "monica", "los", "angeles",
  "pedro", "monte", "palma", "chula", "vista", "cajon", "quinta", "jose",
  "yucca", "huntington", "diego", "ysidro", "costa", "mesa", "lucerne",
  "grove"
)
rural_city_names <- c("santa", "fe", "central", "wheeler")
# tf-idf of store-name bigrams for the 6 largest urban counties
tf_idf_urb <- combined1 %>%
  filter(COUNTY %in% c(
    "LOS ANGELES", "SAN DIEGO", "ORANGE",
    "RIVERSIDE", "SAN BERNARDINO", "SANTA CLARA"
  )) %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word1 %in% urb_city_names,
    !word2 %in% urb_city_names,
    !is.na(word1), !is.na(word2)
  ) %>%
  count(COUNTY, bigram) %>%
  bind_tf_idf(bigram, COUNTY, n) %>%
  # plot the 10 bigrams with the highest tf-idf per county
  group_by(COUNTY) %>%
  slice_max(order_by = tf_idf, n = 10) %>%
  ungroup() %>%
  # FIX: reorder_within() (tidytext) ranks bigrams independently inside
  # each free-scale facet; a plain reorder() imposes one global ordering
  # and misorders the bars within facets
  mutate(bigram = reorder_within(bigram, tf_idf, COUNTY)) %>%
  ggplot() +
  geom_col(aes(tf_idf, bigram, fill = COUNTY)) +
  facet_wrap(~COUNTY, scales = "free") +
  scale_y_reordered() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  guides(fill = "none")
tf_idf_urb
# - - - - - - - - - -
# tf-idf of store-name bigrams for the 6 largest rural counties
# (Santa Barbara excluded: did not feel reflective of rural)
tf_idf_rur_high <- combined1 %>%
  filter(COUNTY %in% c(
    "KERN", "STANISLAUS", "SONOMA", "TULARE", "SOLANO", "MONTEREY"
  )) %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word1 %in% rural_city_names,
    !word2 %in% rural_city_names,
    !is.na(word1), !is.na(word2)
  ) %>%
  count(COUNTY, bigram) %>%
  bind_tf_idf(bigram, COUNTY, n) %>%
  # plot the 5 store-name bigrams with the highest tf-idf per county
  group_by(COUNTY) %>%
  slice_max(order_by = tf_idf, n = 5) %>%
  ungroup() %>%
  # FIX: order bars within each facet independently (see urban plot);
  # the trailing no-argument scale_x_continuous() was a no-op and is removed
  mutate(bigram = reorder_within(bigram, tf_idf, COUNTY)) %>%
  ggplot(aes(tf_idf, bigram, fill = COUNTY)) +
  geom_col() +
  facet_wrap(~COUNTY, scales = "free") +
  scale_y_reordered() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  guides(fill = "none")
tf_idf_rur_high
# - - - - - - - - - -
# tf-idf of store-name bigrams for the 6 smallest rural counties
tf_idf_rur_low <- combined1 %>%
  filter(COUNTY %in% c(
    "ALPINE", "SIERRA", "MODOC", "AMADOR", "MONO", "MARIPOSA"
  )) %>%
  # FIX: also drop NA bigrams, for consistency with the urban and
  # large-rural pipelines above (this plot previously kept them)
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !is.na(word1), !is.na(word2)
  ) %>%
  count(COUNTY, bigram) %>%
  bind_tf_idf(bigram, COUNTY, n) %>%
  # plot the 5 store-name bigrams with the highest tf-idf per county
  group_by(COUNTY) %>%
  slice_max(order_by = tf_idf, n = 5) %>%
  ungroup() %>%
  # FIX: order bars within each facet independently (see urban plot)
  mutate(bigram = reorder_within(bigram, tf_idf, COUNTY)) %>%
  ggplot(aes(tf_idf, bigram, fill = COUNTY)) +
  geom_col() +
  facet_wrap(~COUNTY, scales = "free") +
  scale_y_reordered() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  guides(fill = "none")
tf_idf_rur_low